Let us load the dataset from the file spotify_dataset.csv into a data frame called df_spotify. Because this file is too large to upload to GitHub, we store it in a folder called spotify_dataset that is located one level above the project folder.

# Load the Spotify tracks dataset. read.csv() already returns a data.frame,
# so it is assigned directly.
# NOTE(review): the path is resolved against the working directory, while the
# accompanying text describes a `spotify_dataset` folder one level up -- confirm.
df_spotify <- read.csv(file = 'spotify_dataset.csv')
# Compact overview: dimensions, column types and the first few values.
str(object = df_spotify)
## 'data.frame':    114000 obs. of  21 variables:
##  $ X               : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ track_id        : chr  "5SuOikwiRyPMVoIQDJUgSV" "4qPNDBW1i3p13qLCt0Ki3A" "1iJBSr7s7jYXzM8EGcbK5b" "6lfxq3CG4xtTiEg7opyCyx" ...
##  $ artists         : chr  "Gen Hoshino" "Ben Woodward" "Ingrid Michaelson;ZAYN" "Kina Grannis" ...
##  $ album_name      : chr  "Comedy" "Ghost (Acoustic)" "To Begin Again" "Crazy Rich Asians (Original Motion Picture Soundtrack)" ...
##  $ track_name      : chr  "Comedy" "Ghost - Acoustic" "To Begin Again" "Can't Help Falling In Love" ...
##  $ popularity      : int  73 55 57 71 82 58 74 80 74 56 ...
##  $ duration_ms     : int  230666 149610 210826 201933 198853 214240 229400 242946 189613 205594 ...
##  $ explicit        : chr  "False" "False" "False" "False" ...
##  $ danceability    : num  0.676 0.42 0.438 0.266 0.618 0.688 0.407 0.703 0.625 0.442 ...
##  $ energy          : num  0.461 0.166 0.359 0.0596 0.443 0.481 0.147 0.444 0.414 0.632 ...
##  $ key             : int  1 1 0 0 2 6 2 11 0 1 ...
##  $ loudness        : num  -6.75 -17.23 -9.73 -18.52 -9.68 ...
##  $ mode            : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ speechiness     : num  0.143 0.0763 0.0557 0.0363 0.0526 0.105 0.0355 0.0417 0.0369 0.0295 ...
##  $ acousticness    : num  0.0322 0.924 0.21 0.905 0.469 0.289 0.857 0.559 0.294 0.426 ...
##  $ instrumentalness: num  1.01e-06 5.56e-06 0.00 7.07e-05 0.00 0.00 2.89e-06 0.00 0.00 4.19e-03 ...
##  $ liveness        : num  0.358 0.101 0.117 0.132 0.0829 0.189 0.0913 0.0973 0.151 0.0735 ...
##  $ valence         : num  0.715 0.267 0.12 0.143 0.167 0.666 0.0765 0.712 0.669 0.196 ...
##  $ tempo           : num  87.9 77.5 76.3 181.7 119.9 ...
##  $ time_signature  : int  4 4 4 3 4 4 3 4 4 4 ...
##  $ track_genre     : chr  "acoustic" "acoustic" "acoustic" "acoustic" ...
# Show the first six rows of the dataset.
head(df_spotify, n = 6L)
##   X               track_id                artists
## 1 0 5SuOikwiRyPMVoIQDJUgSV            Gen Hoshino
## 2 1 4qPNDBW1i3p13qLCt0Ki3A           Ben Woodward
## 3 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN
## 4 3 6lfxq3CG4xtTiEg7opyCyx           Kina Grannis
## 5 4 5vjLSffimiIP26QG5WcN2K       Chord Overstreet
## 6 5 01MVOl9KtVTNfFiBU9I7dc           Tyrone Wells
##                                               album_name
## 1                                                 Comedy
## 2                                       Ghost (Acoustic)
## 3                                         To Begin Again
## 4 Crazy Rich Asians (Original Motion Picture Soundtrack)
## 5                                                Hold On
## 6                                   Days I Will Remember
##                   track_name popularity duration_ms explicit danceability
## 1                     Comedy         73      230666    False        0.676
## 2           Ghost - Acoustic         55      149610    False        0.420
## 3             To Begin Again         57      210826    False        0.438
## 4 Can't Help Falling In Love         71      201933    False        0.266
## 5                    Hold On         82      198853    False        0.618
## 6       Days I Will Remember         58      214240    False        0.688
##   energy key loudness mode speechiness acousticness instrumentalness liveness
## 1 0.4610   1   -6.746    0      0.1430       0.0322         1.01e-06   0.3580
## 2 0.1660   1  -17.235    1      0.0763       0.9240         5.56e-06   0.1010
## 3 0.3590   0   -9.734    1      0.0557       0.2100         0.00e+00   0.1170
## 4 0.0596   0  -18.515    1      0.0363       0.9050         7.07e-05   0.1320
## 5 0.4430   2   -9.681    1      0.0526       0.4690         0.00e+00   0.0829
## 6 0.4810   6   -8.807    1      0.1050       0.2890         0.00e+00   0.1890
##   valence   tempo time_signature track_genre
## 1   0.715  87.917              4    acoustic
## 2   0.267  77.489              4    acoustic
## 3   0.120  76.332              4    acoustic
## 4   0.143 181.740              3    acoustic
## 5   0.167 119.949              4    acoustic
## 6   0.666  98.017              4    acoustic
# Show the last six rows of the dataset.
tail(df_spotify, n = 6L)
##             X               track_id          artists
## 113995 113994 4WbOUe6T0sozC7z5ZJgiAA   Lucas Cervetti
## 113996 113995 2C3TZjDRiAzdyViavDJ217    Rainy Lullaby
## 113997 113996 1hIz5L4IB9hN3WRYPOCGPw    Rainy Lullaby
## 113998 113997 6x8ZfSoqDjuNa5SVP5QjvX    Cesária Evora
## 113999 113998 2e6sXL2bYv4bSz6VTdnfLs Michael W. Smith
## 114000 113999 2hETkH7cOfqmz3LqZDHZf5    Cesária Evora
##                                                                             album_name
## 113995                                                    Frecuencias Álmicas en 432hz
## 113996 #mindfulness - Soft Rain for Mindful Meditation, Stress Relief Relaxation Music
## 113997 #mindfulness - Soft Rain for Mindful Meditation, Stress Relief Relaxation Music
## 113998                                                                         Best Of
## 113999                                                               Change Your World
## 114000                                                                  Miss Perfumado
##                      track_name popularity duration_ms explicit danceability
## 113995 Frecuencia Álmica, Pt. 4         22      305454    False        0.331
## 113996      Sleep My Little Boy         21      384999    False        0.172
## 113997         Water Into Light         22      385000    False        0.174
## 113998           Miss Perfumado         22      271466    False        0.629
## 113999                  Friends         41      283893    False        0.587
## 114000                Barbincor         22      241826    False        0.526
##        energy key loudness mode speechiness acousticness instrumentalness
## 113995  0.171   1  -15.668    1      0.0350        0.920           0.0229
## 113996  0.235   5  -16.393    1      0.0422        0.640           0.9280
## 113997  0.117   0  -18.318    0      0.0401        0.994           0.9760
## 113998  0.329   0  -10.895    0      0.0420        0.867           0.0000
## 113999  0.506   7  -10.889    1      0.0297        0.381           0.0000
## 114000  0.487   1  -10.204    0      0.0725        0.681           0.0000
##        liveness valence   tempo time_signature track_genre
## 113995   0.0679  0.3270 132.147              3 world-music
## 113996   0.0863  0.0339 125.995              5 world-music
## 113997   0.1050  0.0350  85.239              4 world-music
## 113998   0.0839  0.7430 132.378              4 world-music
## 113999   0.2700  0.4130 135.960              4 world-music
## 114000   0.0893  0.7080  79.198              4 world-music
# Per-column summary: quartiles and mean for numeric columns,
# length/class/mode for character columns.
summary(object = df_spotify)
##        X            track_id           artists           album_name       
##  Min.   :     0   Length:114000      Length:114000      Length:114000     
##  1st Qu.: 28500   Class :character   Class :character   Class :character  
##  Median : 57000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 57000                                                           
##  3rd Qu.: 85499                                                           
##  Max.   :113999                                                           
##   track_name          popularity      duration_ms        explicit        
##  Length:114000      Min.   :  0.00   Min.   :      0   Length:114000     
##  Class :character   1st Qu.: 17.00   1st Qu.: 174066   Class :character  
##  Mode  :character   Median : 35.00   Median : 212906   Mode  :character  
##                     Mean   : 33.24   Mean   : 228029                     
##                     3rd Qu.: 50.00   3rd Qu.: 261506                     
##                     Max.   :100.00   Max.   :5237295                     
##   danceability        energy            key            loudness      
##  Min.   :0.0000   Min.   :0.0000   Min.   : 0.000   Min.   :-49.531  
##  1st Qu.:0.4560   1st Qu.:0.4720   1st Qu.: 2.000   1st Qu.:-10.013  
##  Median :0.5800   Median :0.6850   Median : 5.000   Median : -7.004  
##  Mean   :0.5668   Mean   :0.6414   Mean   : 5.309   Mean   : -8.259  
##  3rd Qu.:0.6950   3rd Qu.:0.8540   3rd Qu.: 8.000   3rd Qu.: -5.003  
##  Max.   :0.9850   Max.   :1.0000   Max.   :11.000   Max.   :  4.532  
##       mode         speechiness       acousticness    instrumentalness  
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00e+00  
##  1st Qu.:0.0000   1st Qu.:0.03590   1st Qu.:0.0169   1st Qu.:0.00e+00  
##  Median :1.0000   Median :0.04890   Median :0.1690   Median :4.16e-05  
##  Mean   :0.6376   Mean   :0.08465   Mean   :0.3149   Mean   :1.56e-01  
##  3rd Qu.:1.0000   3rd Qu.:0.08450   3rd Qu.:0.5980   3rd Qu.:4.90e-02  
##  Max.   :1.0000   Max.   :0.96500   Max.   :0.9960   Max.   :1.00e+00  
##     liveness         valence           tempo        time_signature 
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :0.000  
##  1st Qu.:0.0980   1st Qu.:0.2600   1st Qu.: 99.22   1st Qu.:4.000  
##  Median :0.1320   Median :0.4640   Median :122.02   Median :4.000  
##  Mean   :0.2136   Mean   :0.4741   Mean   :122.15   Mean   :3.904  
##  3rd Qu.:0.2730   3rd Qu.:0.6830   3rd Qu.:140.07   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :0.9950   Max.   :243.37   Max.   :5.000  
##  track_genre       
##  Length:114000     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

First, let’s check the percentage of NAs present in each column of the dataset.

# Percentage of missing values per column: the column-wise mean of the
# logical NA mask is the NA fraction, scaled to percent.
100 * colMeans(is.na(df_spotify))
##                X         track_id          artists       album_name 
##                0                0                0                0 
##       track_name       popularity      duration_ms         explicit 
##                0                0                0                0 
##     danceability           energy              key         loudness 
##                0                0                0                0 
##             mode      speechiness     acousticness instrumentalness 
##                0                0                0                0 
##         liveness          valence            tempo   time_signature 
##                0                0                0                0 
##      track_genre 
##                0
# Correlation plot of the numeric track features.
# Keep only the numeric columns and drop X, the 0-based row index carried over
# from the CSV, which is not a track feature. `select(where(...))` replaces the
# superseded `select_if()`, and the column drop is folded into the same call.
data_corr <- df_spotify %>%
  select(where(is.numeric), -X)
# Pairwise correlations (cor() default method) rendered as a mixed plot.
corrplot.mixed(cor(data_corr))

# Bar chart of the number of tracks at each popularity score.
# The former `fill = popularity` mapping was removed: popularity is a
# continuous variable, so the mapping cannot be carried through the count
# statistic and the bars were drawn with the default fill regardless.
ggplot(df_spotify, aes(x = popularity)) +
  geom_bar()

# Distribution of danceability.
# danceability is a continuous score, so it is binned with geom_histogram()
# (same bins and colours as the other feature histograms in this file) instead
# of drawing one bar per distinct value with geom_bar(). The former
# `fill = danceability` mapping was removed because a continuous fill cannot
# be carried through the count statistic and had no visible effect.
ggplot(df_spotify, aes(x = danceability)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9, bins = 75)

# Distribution of acousticness across all tracks, split into 75 bins.
acHist <- ggplot(data = df_spotify, mapping = aes(x = acousticness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of Acousticness")
acHist

# Scatter plot of acousticness (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(). Inside aes() the hex string was treated as a one-level data
# variable, which drew the points in the default palette colour and added a
# meaningless legend. "#69b3a2" is used because "#e9ecef" is a very light grey
# that is hard to see against the default panel background.
acSctr <- ggplot(df_spotify, aes(x = popularity, y = acousticness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for Acousticness")
acSctr

# Violin plot of popularity, positioned by acousticness.
# Original author note: this violin plot may work better alongside violin
# plots of the other variables.
# The fill is a fixed colour, so it is set on the geom instead of inside
# aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend.
# NOTE(review): acousticness is continuous and no grouping is supplied, so
# this presumably draws a single violin -- consider binning x (e.g. with
# cut_width()) if one violin per acousticness range is intended.
acV <- ggplot(df_spotify, aes(x = acousticness, y = popularity)) +
  geom_violin(fill = "#69b3a2") +
  ggtitle("Violinplot for Acousticness")
acV

# Basic histograms and scatter plots follow for every remaining variable that
# is not part of our SMART question.
# Distribution of speechiness across all tracks, split into 75 bins.
speechinessHist <- ggplot(data = df_spotify, mapping = aes(x = speechiness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of speechiness")
speechinessHist

# Scatter plot of speechiness (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend. "#69b3a2" is
# used because "#e9ecef" is a very light grey that is hard to see against the
# default panel background.
speechinessSct <- ggplot(df_spotify, aes(x = popularity, y = speechiness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for speechiness")
speechinessSct

# Distribution of instrumentalness across all tracks, split into 75 bins.
instrumentalnessHist <- ggplot(
  data = df_spotify,
  mapping = aes(x = instrumentalness)
) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of instrumentalness")
instrumentalnessHist

# Scatter plot of instrumentalness (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend. "#69b3a2" is
# used because "#e9ecef" is a very light grey that is hard to see against the
# default panel background.
instrumentalnessSctr <- ggplot(
  df_spotify,
  aes(x = popularity, y = instrumentalness)
) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for instrumentalness")
instrumentalnessSctr

# Distribution of liveness across all tracks, split into 75 bins.
livenessHist <- ggplot(data = df_spotify, mapping = aes(x = liveness)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of liveness")
livenessHist

# Scatter plot of liveness (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend. "#69b3a2" is
# used because "#e9ecef" is a very light grey that is hard to see against the
# default panel background.
livenessSct <- ggplot(df_spotify, aes(x = popularity, y = liveness)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for liveness")
livenessSct

# Distribution of valence across all tracks, split into 75 bins.
# The title previously read "Histogram of instrumentalness" (copy-paste slip);
# it now names the variable that is actually plotted.
valenceHist <- ggplot(df_spotify, aes(x = valence)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9, bins = 75) +
  ggtitle("Histogram of valence")
valenceHist

# Scatter plot of valence (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend. "#69b3a2" is
# used because "#e9ecef" is a very light grey that is hard to see against the
# default panel background.
valenceSctr <- ggplot(df_spotify, aes(x = popularity, y = valence)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for valence")
valenceSctr

# Distribution of tempo across all tracks, split into 75 bins.
tempoHist <- ggplot(data = df_spotify, mapping = aes(x = tempo)) +
  geom_histogram(
    bins = 75,
    fill = "#69b3a2",
    color = "#e9ecef",
    alpha = 0.9
  ) +
  labs(title = "Histogram of tempo")
tempoHist

# Scatter plot of tempo (y) against popularity (x).
# The point colour is a fixed value, so it is set on the geom instead of
# inside aes(); inside aes() the hex string was treated as a data value, which
# produced the default palette colour and a meaningless legend. "#69b3a2" is
# used because "#e9ecef" is a very light grey that is hard to see against the
# default panel background.
tempoSctr <- ggplot(df_spotify, aes(x = popularity, y = tempo)) +
  geom_point(size = 0.1, color = "#69b3a2") +
  ggtitle("Scatterplot for tempo")
tempoSctr